Final Project

Data Science for Biologists, Spring 2021

Naman Srivastava



Introduction

These data come from AMR, an R package for antimicrobial resistance analysis that is frequently used in epidemiological investigations. I am particularly interested in the field of antibiotic resistance, and this R package contains a wide variety of suitable datasets that can be used for prediction analyses and data visualization.

AMR Package

glimpse(microorganisms) #Preview the columns, column types and first values of the microorganisms dataset
## Rows: 70,026
## Columns: 16
## $ mo         <mo> "F_FUNGUS", "B_GRAMN", "B_GRAMP", "UNKNOWN", "F_YEAST", "B_[…
## $ fullname   <chr> "(unknown fungus)", "(unknown Gram-negatives)", "(unknown G…
## $ kingdom    <chr> "Fungi", "Bacteria", "Bacteria", "(unknown kingdom)", "Fung…
## $ phylum     <chr> "(unknown phylum)", "(unknown phylum)", "(unknown phylum)",…
## $ class      <chr> "(unknown class)", "(unknown class)", "(unknown class)", "(…
## $ order      <chr> "(unknown order)", "(unknown order)", "(unknown order)", "(…
## $ family     <chr> "(unknown family)", "(unknown family)", "(unknown family)",…
## $ genus      <chr> "(unknown genus)", "(unknown Gram-negatives)", "(unknown Gr…
## $ species    <chr> "(unknown species)", "(unknown species)", "(unknown species…
## $ subspecies <chr> "(unknown subspecies)", "(unknown subspecies)", "(unknown s…
## $ rank       <chr> "species", "species", "species", "(unknown rank)", "species…
## $ ref        <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Tahon et al., 2018", "", "…
## $ species_id <chr> "", "", "", "", "", "", "", "", "797965", "9164ea7340beaa54…
## $ source     <chr> "manually added", "manually added", "manually added", "manu…
## $ prevalence <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,…
## $ snomed     <list> <>, <>, <>, <>, <>, <>, <>, <>, <>, <>, <>, <>, <>, <>, "1…
glimpse(antibiotics) #Preview the columns, column types and first values of the antibiotics dataset
## Rows: 456
## Columns: 14
## $ ab            <ab> "AMA", "FCT", "ACM", "ASP", "ALS", "AMK", "AKF", "AMX", "…
## $ atc           <chr> "J04AA01", "D01AE21", NA, NA, "J04BA03", "J01GB06", NA, …
## $ cid           <dbl> 4649, 3366, 6450012, 49787020, 8954, 37768, NA, 33613, 2…
## $ name          <chr> "4-aminosalicylic acid", "5-fluorocytosine", "Acetylmide…
## $ group         <chr> "Antimycobacterials", "Antifungals/antimycotics", "Macro…
## $ atc_group1    <chr> "Drugs for treatment of tuberculosis", "Antifungals for …
## $ atc_group2    <chr> "Aminosalicylic acid and derivatives", "Other antifungal…
## $ abbreviations <list> "", <"5flc", "fluo">, "", "", "", <"ak", "ami", "amik",…
## $ synonyms      <list> <"aminopar", "aminosalicylic", "aminosalicylic acid", "…
## $ oral_ddd      <dbl> 12.00, NA, NA, NA, 0.33, NA, NA, 1.50, 1.50, NA, NA, NA,…
## $ oral_units    <chr> "g", NA, NA, NA, "g", NA, NA, "g", "g", NA, NA, NA, "g",…
## $ iv_ddd        <dbl> NA, NA, NA, NA, NA, 1.0, NA, 3.0, 3.0, NA, 35.0, NA, 6.0…
## $ iv_units      <chr> NA, NA, NA, NA, NA, "g", NA, "g", "g", NA, "mg", NA, "g"…
## $ loinc         <list> <>, <"10974-4", "23805-5", "25142-1", "25143-9", "3639-…
glimpse(example_isolates) #Preview the columns, column types and first values of the example_isolates dataset
## Rows: 2,000
## Columns: 49
## $ date            <date> 2002-01-02, 2002-01-03, 2002-01-07, 2002-01-07, 2002-…
## $ hospital_id     <fct> D, D, B, B, B, B, D, D, B, B, D, D, D, D, D, B, B, B, …
## $ ward_icu        <lgl> FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TR…
## $ ward_clinical   <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ ward_outpatient <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ age             <dbl> 65, 65, 45, 45, 45, 45, 78, 78, 45, 79, 67, 67, 71, 71…
## $ gender          <chr> "F", "F", "F", "F", "F", "F", "M", "M", "F", "F", "M",…
## $ patient_id      <chr> "A77334", "A77334", "067927", "067927", "067927", "067…
## $ mo              <mo> "B_ESCHR_COLI", "B_ESCHR_COLI", "B_STPHY_EPDR", "B_STPH…
## $ PEN             <rsi> R, R, R, R, R, R, R, R, R, R, R, R, R, R, R, R, R, R, …
## $ OXA             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ FLC             <rsi> NA, NA, R, R, R, R, S, S, R, S, S, S, NA, NA, NA, NA, …
## $ AMX             <rsi> NA, NA, NA, NA, NA, NA, R, R, NA, NA, NA, NA, NA, NA, …
## $ AMC             <rsi> I, I, NA, NA, NA, NA, S, S, NA, NA, S, S, I, I, R, I, …
## $ AMP             <rsi> NA, NA, NA, NA, NA, NA, R, R, NA, NA, NA, NA, NA, NA, …
## $ TZP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ CZO             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ FEP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ CXM             <rsi> I, I, R, R, R, R, S, S, R, S, S, S, S, S, NA, S, S, R,…
## $ FOX             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ CTX             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, S, S, …
## $ CAZ             <rsi> NA, NA, R, R, R, R, R, R, R, R, R, R, NA, NA, NA, S, S…
## $ CRO             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, S, S, …
## $ GEN             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ TOB             <rsi> NA, NA, NA, NA, NA, NA, S, S, NA, NA, NA, NA, S, S, NA…
## $ AMK             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ KAN             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ TMP             <rsi> R, R, S, S, R, R, R, R, S, S, NA, NA, S, S, S, S, S, R…
## $ SXT             <rsi> R, R, S, S, NA, NA, NA, NA, S, S, NA, NA, S, S, S, S, …
## $ NIT             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ FOS             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ LNZ             <rsi> R, R, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, R, R, R,…
## $ CIP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, S, S, NA, NA, NA, NA, …
## $ MFX             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ VAN             <rsi> R, R, S, S, S, S, S, S, S, S, NA, NA, R, R, R, R, R, S…
## $ TEC             <rsi> R, R, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, R, R, R,…
## $ TCY             <rsi> R, R, S, S, S, S, S, S, S, I, S, S, NA, NA, I, R, R, S…
## $ TGC             <rsi> NA, NA, S, S, S, S, S, S, S, NA, S, S, NA, NA, NA, R, …
## $ DOX             <rsi> NA, NA, S, S, S, S, S, S, S, NA, S, S, NA, NA, NA, R, …
## $ ERY             <rsi> R, R, R, R, R, R, S, S, R, S, S, S, R, R, R, R, R, R, …
## $ CLI             <rsi> R, R, NA, NA, NA, R, NA, NA, NA, NA, NA, NA, R, R, R, …
## $ AZM             <rsi> R, R, R, R, R, R, S, S, R, S, S, S, R, R, R, R, R, R, …
## $ IPM             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, S, S, …
## $ MEM             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ MTR             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ CHL             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ COL             <rsi> NA, NA, R, R, R, R, R, R, R, R, R, R, NA, NA, NA, R, R…
## $ MUP             <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ RIF             <rsi> R, R, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, R, R, R,…
glimpse(WHONET) #Preview the columns, column types and first values of the WHONET dataset
## Rows: 500
## Columns: 53
## $ `Identification number`            <chr> "fe41d7bafa", "91f175ec37", "cc4015…
## $ `Specimen number`                  <int> 1748, 1767, 1343, 1894, 1739, 1846,…
## $ Organism                           <chr> "SPN", "eco", "eco", "MAP", "PVU", …
## $ Country                            <chr> "Belgium", "The Netherlands", "The …
## $ Laboratory                         <chr> "National Laboratory of Belgium", "…
## $ `Last name`                        <chr> "Abel", "Delacroix", "Steensen", "B…
## $ `First name`                       <chr> "B.", "F.", "F.", "L.", "W.", "J.",…
## $ Sex                                <chr> "F", "M", "M", "M", "M", "F", "F", …
## $ Age                                <dbl> 68, 89, 85, 62, 86, 53, 77, 53, 63,…
## $ `Age category`                     <chr> "55-74", "75+", "75+", "55-74", "75…
## $ `Date of admission`                <date> 2005-01-12, 2006-07-30, 2014-03-05…
## $ `Specimen date`                    <date> 2005-01-30, 2006-08-16, 2014-03-14…
## $ `Specimen type`                    <chr> "Urine", "Urine", "Urine", "Urine",…
## $ `Specimen type (Numeric)`          <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ Reason                             <chr> "Unknown", "Unknown", "Unknown", "U…
## $ `Isolate number`                   <int> 1748, 1767, 1343, 1894, 1739, 1846,…
## $ `Organism type`                    <chr> "Bacteria", "Bacteria", "Bacteria",…
## $ Serotype                           <chr> "", "", "", "", "", "", "", "", "",…
## $ `Beta-lactamase`                   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ ESBL                               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ Carbapenemase                      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ `MRSA screening test`              <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ `Inducible clindamycin resistance` <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ Comment                            <chr> "", "", "", "", "", "", "", "", "",…
## $ `Date of data entry`               <date> 2005-01-30, 2006-08-16, 2014-03-14…
## $ AMP_ND10                           <rsi> S, NA, S, R, R, S, NA, NA, R, NA, S…
## $ AMC_ED20                           <rsi> S, S, S, NA, R, S, S, S, R, S, S, I…
## $ TZP_ED30                           <rsi> S, NA, S, NA, S, S, S, NA, S, NA, S…
## $ FEP_ED30                           <rsi> NA, NA, NA, NA, NA, NA, S, NA, S, N…
## $ CTX_ED5                            <rsi> NA, NA, NA, NA, S, S, S, NA, R, NA,…
## $ FOX_ED30                           <rsi> NA, NA, NA, NA, R, S, S, NA, R, NA,…
## $ CAZ_ED10                           <rsi> R, R, NA, R, S, S, S, R, S, R, S, S…
## $ CRO_ED30                           <rsi> NA, NA, NA, NA, S, S, S, NA, R, NA,…
## $ CIP_ED5                            <rsi> NA, NA, NA, NA, S, S, S, S, S, NA, …
## $ AMK_ED30                           <rsi> NA, NA, NA, NA, NA, NA, NA, NA, S, …
## $ GEN_ED10                           <rsi> R, S, NA, R, S, S, S, S, S, S, S, S…
## $ TOB_ED10                           <rsi> R, NA, NA, R, S, S, S, NA, S, NA, S…
## $ SXT_ED1.2                          <rsi> S, R, NA, NA, R, S, S, S, R, S, S, …
## $ IPM_ND10                           <rsi> NA, NA, NA, NA, I, S, S, NA, S, NA,…
## $ PEN_ND1                            <rsi> S, R, S, R, R, R, R, R, R, R, R, R,…
## $ AMP_ND2                            <rsi> S, NA, S, R, R, S, NA, NA, R, NA, S…
## $ AMC_ND2                            <rsi> S, S, S, NA, R, S, S, S, R, S, S, I…
## $ CHL_ND30                           <rsi> NA, NA, NA, NA, NA, NA, NA, NA, R, …
## $ VAN_ED5                            <rsi> S, S, NA, NA, R, R, R, S, R, S, R, …
## $ OXA_ED1                            <rsi> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ERY_ED15                           <rsi> S, S, NA, NA, R, R, R, S, R, S, R, …
## $ CLI_ED2                            <rsi> NA, S, S, S, NA, NA, NA, NA, NA, NA…
## $ TCY_ED30                           <rsi> S, S, NA, NA, R, NA, NA, R, R, S, R…
## $ RIF_ED5                            <rsi> NA, NA, NA, NA, R, R, R, NA, R, NA,…
## $ PEN_EE                             <rsi> S, R, S, R, R, R, R, R, R, R, R, R,…
## $ AMP_EE                             <rsi> S, NA, S, R, R, S, NA, NA, R, NA, S…
## $ CRO_EE                             <rsi> NA, NA, NA, NA, S, S, S, NA, R, NA,…
## $ CIP_EE                             <rsi> NA, NA, NA, NA, S, S, S, S, S, NA, …


Question 1

Which Phylum of Fungi is most prevalent within the microorganisms dataset?

Methods

ggthemr("light") #Apply the "light" ggthemr theme
  #Tally the Fungi entries per phylum and draw the tallies as horizontal bars
  Plot1 <- microorganisms %>%
    filter(kingdom == "Fungi") %>% #Keep only the rows whose kingdom is Fungi
    count(phylum) %>% #One row per phylum; the tally is stored in column n
    ggplot(aes(x = n, #Tally on the x-axis
               y = reorder(phylum, n), #Phylum on the y-axis, ordered by tally
               fill = phylum)) + #Bar fill depends on the phylum
    geom_col() +
    scale_x_log10() + #Log10 scale for the tallies
    labs(title = "Fungi Phylums examined within microorganisms dataset", #Plot title
         x = "Count", #X-axis label
         y = "Phylums") + #Y-axis label
    theme(plot.background = element_rect(fill = "#f7f7f7"), #Plot background color
          legend.position = "none") #Hide the legend
  ggplotly(Plot1, tooltip = c("n", "phylum")) #Interactive version; hover label limited to n and phylum

Answer

The fungal phylum Ascomycota appears to be the most prevalent within this dataset, as it has the highest count of entries.



Question 2

Is there a linear relationship between oral and iv dosages of antibiotics?

Methods

ggthemr("light") #Set theme
  #Scatterplot of the oral versus the IV defined daily dose (DDD) of each antibiotic.
  #Only antibiotics with both values present are kept (a point needs an x and a y),
  #and only when both doses are expressed in the same unit: the dataset stores the
  #unit per route (oral_units / iv_units), so comparing e.g. grams with milligrams
  #would distort the relationship.
  Plot2 <- antibiotics %>% #Use antibiotics dataset
    filter(!is.na(oral_ddd), #Oral dose must be known
           !is.na(iv_ddd), #IV dose must be known
           oral_units == iv_units) %>% #Both doses must share one unit
    ggplot() +
    aes(x = oral_ddd, #Map x-axis to oral dosages
        y = iv_ddd) + #Map y-axis to iv dosages
    geom_point() + #Create scatterplot
    labs(y = "IV Dosage", #Y-axis label
         x = "Oral Dosage", #X-axis label
         title = "Oral vs IV Dosages of Antibiotics") + #Add title (typo "Dosgaes" corrected)
    scale_x_log10() + #Log scale x-axis
    scale_y_log10() + #Log scale y-axis
    theme(plot.background = element_rect(fill = "#f7f7f7")) #Change background color
  ggplotly(Plot2) #Interactive version of the plot

Answer

Yes, on log-scaled axes we can observe a positive, roughly linear relationship between oral and IV dosages of antibiotics.



Question 3

In the future, do we expect Gram-positive Bacteria to become more or less resistant to the antibiotic Doxycycline?

Methods

ggthemr("light") #Apply the "light" ggthemr theme
  #Predict doxycycline resistance per year for Gram-positive isolates and
  #draw the predicted values as bars with standard-error whiskers
  Plot3 <- example_isolates %>%
    filter(mo_gramstain(mo, language = NULL) == "Gram-positive") %>% #Gram-positive isolates only
    resistance_predict(col_ab = "DOX", #Column with the Doxycycline results
                       col_date = "date", #Column with the dates
                       model = "binomial",
                       minimum = 15,
                       info = FALSE) %>%
    ggplot(aes(x = year, #Year on the x-axis
               y = value)) + #Resistance value on the y-axis
    geom_col() + #Bars for the values
    geom_errorbar(aes(ymin = se_min, #Whisker bottom at se_min
                      ymax = se_max), #Whisker top at se_max
                  width = 0.6, #Whisker width
                  color = "#31a183") + #Whisker color
    scale_y_continuous(limits = c(0, 1), #Y-axis runs from 0 to 1
                       breaks = seq(0, 1, by = 0.1), #A break every 0.1
                       labels = paste0(seq(0, 100, by = 10), "%")) + #Breaks shown as percentages
    labs(title = "Forecast of Doxycycline Resistance in Gram-positive Bacteria", #Plot title
         x = "Year", #X-axis label
         y = "%R") + #Y-axis label
    theme(plot.background = element_rect(fill = "#f7f7f7")) #Plot background color
  ggplotly(Plot3) #Interactive version of the plot

Answer

Based on this plot, we can conclude that resistance against Doxycycline is predicted to increase in Gram-positive bacteria in the coming years.



Question 4

Geographically, where was the data from the WHONET dataset compiled?

Methods

  img <- image_url("blue-marble") #Texture used for the globe
  
  #Tally the WHONET rows per three-letter country code. The code is looked up
  #by country name; a country that is not in the lookup gets NA as its code.
  Updated_countries <- WHONET %>%
    mutate(code = unname(c("Belgium" = "BEL",
                           "The Netherlands" = "NLD",
                           "Denmark" = "DNK",
                           "France" = "FRA",
                           "Germany" = "DEU")[Country])) %>% #New code column
    count(code) #Number of rows per code, stored in column n
  
  #Draw the tallies on a globe
  create_globe() %>%
    globe_img_url(img) %>% #Use img as the globe texture
    globe_choropleth(
      data = Updated_countries, #Tallies per country code
      coords(country = code, #Countries identified by the code column
             cap_color = n, #Cap color driven by the tally
             altitude = n)) %>% #Altitude driven by the tally
    globe_background("#f7f7f7") %>% #Background color
    scale_choropleth_cap_color() %>% #Scale for the cap color
    scale_choropleth_altitude(0.06, 0.1) #Scale for the altitude

Answer

The data in the WHONET dataset appear to have been compiled at locations in central and northern Europe.